library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.1 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.1 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1 ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(here)
here() starts at C:/Users/CodeClan/Desktop/CodeClan/dirty_data_project_Cailean/Task 4
library(janitor)
Attaching package: ‘janitor’
The following objects are masked from ‘package:stats’:
chisq.test, fisher.test
#install.packages("openxlsx")
library(openxlsx)
Initial look at the data
steps: remove unneccesary data (i.e anything not given a Joy, Despair, Meh score or specifically asked for in question) - for 2015, regex to get anything wrapped in “[]” - for 2016, ^ - for 2017, regex Q6 | “1$”
# This section shows my adventure in trying to work out why what i thought would be the same regex pattern produced different results
# only_candy_2015 <- raw_candy_2015 %>%
# select(matches("^[[[:print:]]+]$"))
#
# whats_the_difference <- raw_candy_2015 %>%
# select(matches("^[[][[:print:]]+[]]$"))
#
# whats_the_difference2 <- raw_candy_2015 %>%
# select(matches("^[[]{2}[[:print:]]+[]]{1}$"))
#
# list_a <- names(only_candy_2015)
# list_b <- names(whats_the_difference)
#
# list_a[!(list_a %in% list_b)]
Using regex to select all columns that have text wrapped by square brackets, as these contain all the “candy” data
only_candy_2015 <- raw_candy_2015 %>%
select(matches("^[[]{1}[[:print:]]+[]]{1}$"))
clean_names_candy_2015 <- janitor::clean_names(only_candy_2015)
also cleaning up the names adding year, gender and country columns for joining to other data sets Assigning year = 2015, gender = NA, country = NA
only_personal_data_2015 <- raw_candy_2015 %>%
select(`How old are you?`, `Are you going actually going trick or treating yourself?`) %>%
rename("age" = "How old are you?", "going_trick_or_treating" = "Are you going actually going trick or treating yourself?") %>%
mutate(year = 2015, .before = "age") %>%
mutate(gender = NA, .after = "age") %>%
mutate(country = NA, .after = "gender")
# only_personal_data_2015 %>%
# mutate(trick_or_treating_logical = case_when(
# going_trick_or_treating == "No" ~ FALSE,
# going_trick_or_treating == "Yes" ~ TRUE
#
# ))
rejoining data back together
rejoined_2015 <- bind_cols(only_personal_data_2015, clean_names_candy_2015)
only_candy_2016 <- raw_candy_2016 %>%
select(matches("^[[]{1}[[:print:]]+[]]{1}$"))
clean_names_candy_2016 <- janitor::clean_names(only_candy_2016)
# Comparing column names
# list_a <- names(clean_names_candy_2015)
# list_b <- names(clean_names_candy_2016)
#
# tibble(list_a[!(list_a %in% list_b)])
# tibble(list_b[!(list_b %in% list_a)])
# This converts changes the country column to a less stupid name and standardises the data
# This will not pick up every single variant - However, it picks up most of them
country_tidied_2016 <- raw_candy_2016 %>%
select(`Which country do you live in?`) %>%
mutate(`Which country do you live in?` = tolower(`Which country do you live in?`)) %>%
rename(country = `Which country do you live in?`) %>%
mutate(country = case_when(
country == NA ~ "Other",
str_detect(country, "^us|america|states$") ~ "USA",
str_extract(country, "[u]{1}[s]{1}|[eu]+rica|trump") == TRUE ~ "USA",
str_detect(country, "kingdom$|^uk|england|scotland|britain") ~ "UK",
str_detect(country, "canada") ~ "Canada",
TRUE ~ "Other"
))
# extract relevant personal data so it can be tidied and reconnected
only_personal_data_2016 <- raw_candy_2016 %>%
select(`How old are you?`, `Your gender:`,`Are you going actually going trick or treating yourself?` ) %>%
rename("age" = "How old are you?",
"going_trick_or_treating" = "Are you going actually going trick or treating yourself?",
"gender" = "Your gender:") %>%
mutate(year = 2016, .before = "age")
rejoined_2016 <- bind_cols(only_personal_data_2016, country_tidied_2016) %>%
bind_cols(clean_names_candy_2016)
# ## Testing reading country using regex
# raw_candy_2016 %>%
# select(`Which country do you live in?`) %>%
# mutate(`Which country do you live in?` = tolower(`Which country do you live in?`)) %>%
# rename(country = `Which country do you live in?`) %>%
# mutate(country = case_when(
# country == NA ~ "Other",
# str_detect(country, "^us|america|states$|trump") ~ "USA",
# str_extract(country, "[u]{1}[s]{1}|america") == TRUE ~ "USA",
# str_detect(country, "kingdom$|^uk|england|scotland") ~ "UK",
# str_detect(country, "canada") ~ "Canada",
# TRUE ~ "Other"
# ))
#
# country_tidied_2016
This is essentially the same process as 2015 and 2016, but with a different pattern
only_candy_2017 <- raw_candy_2017 %>%
select(matches("^Q6"))
#This pulls out all the "candy" data (assuming of course that all the relevant data is under Q6) - But they have "Q6 | " in the column names
# This removes the "Q6 | " from all the columns
colnames(only_candy_2017) <- gsub("Q6 [\\|] ", " ",colnames(only_candy_2017))
# Clean candy names
clean_names_candy_2017 <- janitor::clean_names(only_candy_2017)
# comparing column names
#
# list_a <- names(clean_names_candy_2015)
# list_b <- names(clean_names_candy_2017)
#
# tibble(list_a[!(list_a %in% list_b)])
# tibble(list_b[!(list_b %in% list_a)])
Like with 2016, country has been left at this stage to be cleaned separately
only_personal_data_2017 <- raw_candy_2017 %>%
select(`Q3: AGE`, `Q2: GENDER`, `Q1: GOING OUT?`) %>%
rename("age" = "Q3: AGE",
"going_trick_or_treating" = "Q1: GOING OUT?",
"gender" = "Q2: GENDER") %>%
mutate(year = 2017, .before = "age")
country_tidied_2017 <- raw_candy_2017 %>%
select(`Q4: COUNTRY`) %>%
mutate(`Q4: COUNTRY` = tolower(`Q4: COUNTRY`)) %>%
rename(country = `Q4: COUNTRY`) %>%
mutate(country = case_when(
country == NA ~ "Other",
str_detect(country, "^us|america|states$") ~ "USA",
str_extract(country, "[u]{1}[s]{1}|[eu]+rica|trump") == TRUE ~ "USA",
str_detect(country, "kingdom$|^uk|england|scotland|britain") ~ "UK",
str_detect(country, "canada") ~ "Canada",
TRUE ~ "Other"
))
rejoined_2017 <- bind_cols(only_personal_data_2017, country_tidied_2017) %>%
bind_cols(clean_names_candy_2017)
rejoined_candy <- full_join(rejoined_2015, rejoined_2016) %>%
full_join(rejoined_2017)
Joining with `by = join_by(year, age, gender, country, going_trick_or_treating, butterfinger, x100_grand_bar, anonymous_brown_globs_that_come_in_black_and_orange_wrappers, any_full_sized_candy_bar, black_jacks, bottle_caps, cadbury_creme_eggs, candy_corn, vials_of_pure_high_fructose_corn_syrup_for_main_lining_into_your_vein, candy_that_is_clearly_just_the_stuff_given_out_for_free_at_restaurants, cash_or_other_forms_of_legal_tender, chiclets, caramellos, snickers, dental_paraphenalia, dots, fuzzy_peaches, generic_brand_acetaminophen, glow_sticks, broken_glow_stick, goo_goo_clusters, good_n_plenty, gum_from_baseball_cards, gummy_bears_straight_up, creepy_religious_comics_chick_tracts, healthy_fruit, heath_bar, hershey_s_milk_chocolate, hugs_actual_physical_hugs, jolly_rancher_bad_flavor, jolly_ranchers_good_flavor, kale_smoothie, kinder_happy_hippo, kit_kat, hard_candy, lemon_heads, licorice_not_black, lindt_truffle, lollipops, mars, mary_janes, maynards, milk_duds, laffy_taffy, minibags_of_chips, joy_joy_mit_iodine, reggie_jackson_bar, pixy_stix, nerds, nestle_crunch, nown_laters, pencils, milky_way, reese_s_peanut_butter_cups, tolberone_something_or_other, junior_mints, senior_mints, mint_kisses, mint_juleps, peanut_m_m_s, regular_m_ms, rolos, skittles, smarties_american, smarties_commonwealth, chick_o_sticks_we_don_t_know_what_that_is, spotted_dick, starburst, swedish_fish, those_odd_marshmallow_circus_peanut_things, three_musketeers, trail_mix, twix, vicodin, white_bread, whole_wheat_anything, york_peppermint_patties, necco_wafers)`Joining with `by = join_by(year, age, gender, country, going_trick_or_treating, butterfinger, x100_grand_bar, any_full_sized_candy_bar, black_jacks, bottle_caps, cadbury_creme_eggs, candy_corn, vials_of_pure_high_fructose_corn_syrup_for_main_lining_into_your_vein, candy_that_is_clearly_just_the_stuff_given_out_for_free_at_restaurants, cash_or_other_forms_of_legal_tender, chiclets, caramellos, snickers, dental_paraphenalia, dots, fuzzy_peaches, generic_brand_acetaminophen, glow_sticks, broken_glow_stick, goo_goo_clusters, good_n_plenty, gum_from_baseball_cards, gummy_bears_straight_up, creepy_religious_comics_chick_tracts, healthy_fruit, heath_bar, hershey_s_milk_chocolate, hugs_actual_physical_hugs, jolly_rancher_bad_flavor, jolly_ranchers_good_flavor, kale_smoothie, kinder_happy_hippo, kit_kat, hard_candy, lemon_heads, licorice_not_black, lindt_truffle, lollipops, mars, maynards, milk_duds, laffy_taffy, minibags_of_chips, joy_joy_mit_iodine, reggie_jackson_bar, pixy_stix, nerds, nestle_crunch, nown_laters, pencils, milky_way, reese_s_peanut_butter_cups, tolberone_something_or_other, junior_mints, senior_mints, mint_kisses, mint_juleps, peanut_m_m_s, regular_m_ms, rolos, skittles, smarties_american, smarties_commonwealth, chick_o_sticks_we_don_t_know_what_that_is, spotted_dick, starburst, swedish_fish, those_odd_marshmallow_circus_peanut_things, three_musketeers, trail_mix, twix, vicodin, white_bread, whole_wheat_anything, york_peppermint_patties, necco_wafers, bonkers_the_candy, bonkers_the_board_game, boxo_raisins, chardonnay, coffee_crisp, dove_bars, hersheys_dark_chocolate, hersheys_kisses, licorice_yes_black, mike_and_ike, blue_m_ms, red_m_ms, mr_goodbar, peeps, reeses_pieces, sourpatch_kids_i_e_abominations_of_nature, sweet_tarts, sweetums_a_friend_to_diabetes, tic_tacs, whatchamacallit_bars)`
# check to see how many NA's were present in age before mutating to integer (317)
# rejoined_candy %>%
# select(age) %>%
# filter(is.na(age)) %>%
# count()
# changing the age column to integer - this will eliminate any "non-number" answers (i.e. "enough" "fifty-four") and replace them with NA
# I have decided to leave the NA's in the data
# Logically, age cannot be over a certain value - I've decided to replace anything over 100 with NA
rejoined_candy <- rejoined_candy %>%
mutate(age = as.integer(age)) %>%
mutate(age = case_when(
age > 100 ~ NA,
age <= 100 ~ age
))
Warning: There were 2 warnings in `mutate()`.
The first warning was:
ℹ In argument: `age = as.integer(age)`.
Caused by warning:
! NAs introduced by coercion
ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 1 remaining warning.
# rejoined_candy %>%
# filter(is.na(age)) %>%
# count()
#checking gender column
unique(rejoined_candy$gender)
[1] NA "Male" "Female" "Other" "I'd rather not say"
# counting NA's to check next step has worked
# rejoined_candy %>%
# select(gender) %>%
# filter(is.na(gender)) %>%
# count()
# replacing NA's with "Not Provided"
rejoined_candy <- rejoined_candy %>%
mutate(gender = replace_na(gender, "Not Provided"))
# doing the same for country - This is technically not necessary, but i think makes it clearer why 2015 has no country data
rejoined_candy <- rejoined_candy %>%
mutate(country = replace_na(country, "Not Provided in 2015 data"))
\[][[:alnum:]]+[\]↩︎